Proteomics dataset

This is analysis for proteomics dataset generated using mouse scraped colon epithelium samples from Fabp-Cre;KRasWT and Fabp-Cre;KRasG12D mice. Emily Poulin harvested the tissue samples and the proteomics were performed by Joao Paulo. Part of the analysis code was adapted from original script from Shikha Sheth.

Library loading and set up

options(connectionObserver = NULL)

suppressMessages(
  c(library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi),
    library(tidyr),
    library(qdapRegex),
    library(gtools),
    library(ggfortify))
)
##   [1] "gridExtra"           "stats"               "graphics"           
##   [4] "grDevices"           "utils"               "datasets"           
##   [7] "methods"             "base"                "ensembldb"          
##  [10] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [13] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [16] "IRanges"             "S4Vectors"           "stats4"             
##  [19] "BiocGenerics"        "parallel"            "gridExtra"          
##  [22] "stats"               "graphics"            "grDevices"          
##  [25] "utils"               "datasets"            "methods"            
##  [28] "base"                "EnsDb.Mmusculus.v79" "ensembldb"          
##  [31] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [34] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [37] "IRanges"             "S4Vectors"           "stats4"             
##  [40] "BiocGenerics"        "parallel"            "gridExtra"          
##  [43] "stats"               "graphics"            "grDevices"          
##  [46] "utils"               "datasets"            "methods"            
##  [49] "base"                "grid"                "EnsDb.Mmusculus.v79"
##  [52] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
##  [55] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
##  [58] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
##  [61] "stats4"              "BiocGenerics"        "parallel"           
##  [64] "gridExtra"           "stats"               "graphics"           
##  [67] "grDevices"           "utils"               "datasets"           
##  [70] "methods"             "base"                "ggplot2"            
##  [73] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
##  [76] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
##  [79] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
##  [82] "IRanges"             "S4Vectors"           "stats4"             
##  [85] "BiocGenerics"        "parallel"            "gridExtra"          
##  [88] "stats"               "graphics"            "grDevices"          
##  [91] "utils"               "datasets"            "methods"            
##  [94] "base"                "lattice"             "ggplot2"            
##  [97] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [100] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [103] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [106] "IRanges"             "S4Vectors"           "stats4"             
## [109] "BiocGenerics"        "parallel"            "gridExtra"          
## [112] "stats"               "graphics"            "grDevices"          
## [115] "utils"               "datasets"            "methods"            
## [118] "base"                "reshape"             "lattice"            
## [121] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [124] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [127] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [130] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [133] "stats4"              "BiocGenerics"        "parallel"           
## [136] "gridExtra"           "stats"               "graphics"           
## [139] "grDevices"           "utils"               "datasets"           
## [142] "methods"             "base"                "mixOmics"           
## [145] "MASS"                "reshape"             "lattice"            
## [148] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [151] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [154] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [157] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [160] "stats4"              "BiocGenerics"        "parallel"           
## [163] "gridExtra"           "stats"               "graphics"           
## [166] "grDevices"           "utils"               "datasets"           
## [169] "methods"             "base"                "gplots"             
## [172] "mixOmics"            "MASS"                "reshape"            
## [175] "lattice"             "ggplot2"             "grid"               
## [178] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [181] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [184] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [187] "S4Vectors"           "stats4"              "BiocGenerics"       
## [190] "parallel"            "gridExtra"           "stats"              
## [193] "graphics"            "grDevices"           "utils"              
## [196] "datasets"            "methods"             "base"               
## [199] "RColorBrewer"        "gplots"              "mixOmics"           
## [202] "MASS"                "reshape"             "lattice"            
## [205] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [208] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [211] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [214] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [217] "stats4"              "BiocGenerics"        "parallel"           
## [220] "gridExtra"           "stats"               "graphics"           
## [223] "grDevices"           "utils"               "datasets"           
## [226] "methods"             "base"                "readr"              
## [229] "RColorBrewer"        "gplots"              "mixOmics"           
## [232] "MASS"                "reshape"             "lattice"            
## [235] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [238] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [241] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [244] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [247] "stats4"              "BiocGenerics"        "parallel"           
## [250] "gridExtra"           "stats"               "graphics"           
## [253] "grDevices"           "utils"               "datasets"           
## [256] "methods"             "base"                "dplyr"              
## [259] "readr"               "RColorBrewer"        "gplots"             
## [262] "mixOmics"            "MASS"                "reshape"            
## [265] "lattice"             "ggplot2"             "grid"               
## [268] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [271] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [274] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [277] "S4Vectors"           "stats4"              "BiocGenerics"       
## [280] "parallel"            "gridExtra"           "stats"              
## [283] "graphics"            "grDevices"           "utils"              
## [286] "datasets"            "methods"             "base"               
## [289] "VennDiagram"         "futile.logger"       "dplyr"              
## [292] "readr"               "RColorBrewer"        "gplots"             
## [295] "mixOmics"            "MASS"                "reshape"            
## [298] "lattice"             "ggplot2"             "grid"               
## [301] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [304] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [307] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [310] "S4Vectors"           "stats4"              "BiocGenerics"       
## [313] "parallel"            "gridExtra"           "stats"              
## [316] "graphics"            "grDevices"           "utils"              
## [319] "datasets"            "methods"             "base"               
## [322] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [325] "dplyr"               "readr"               "RColorBrewer"       
## [328] "gplots"              "mixOmics"            "MASS"               
## [331] "reshape"             "lattice"             "ggplot2"            
## [334] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [337] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [340] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [343] "IRanges"             "S4Vectors"           "stats4"             
## [346] "BiocGenerics"        "parallel"            "gridExtra"          
## [349] "stats"               "graphics"            "grDevices"          
## [352] "utils"               "datasets"            "methods"            
## [355] "base"                "DOSE"                "clusterProfiler"    
## [358] "VennDiagram"         "futile.logger"       "dplyr"              
## [361] "readr"               "RColorBrewer"        "gplots"             
## [364] "mixOmics"            "MASS"                "reshape"            
## [367] "lattice"             "ggplot2"             "grid"               
## [370] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [373] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [376] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [379] "S4Vectors"           "stats4"              "BiocGenerics"       
## [382] "parallel"            "gridExtra"           "stats"              
## [385] "graphics"            "grDevices"           "utils"              
## [388] "datasets"            "methods"             "base"               
## [391] "org.Mm.eg.db"        "DOSE"                "clusterProfiler"    
## [394] "VennDiagram"         "futile.logger"       "dplyr"              
## [397] "readr"               "RColorBrewer"        "gplots"             
## [400] "mixOmics"            "MASS"                "reshape"            
## [403] "lattice"             "ggplot2"             "grid"               
## [406] "EnsDb.Mmusculus.v79" "ensembldb"           "AnnotationFilter"   
## [409] "GenomicFeatures"     "AnnotationDbi"       "Biobase"            
## [412] "GenomicRanges"       "GenomeInfoDb"        "IRanges"            
## [415] "S4Vectors"           "stats4"              "BiocGenerics"       
## [418] "parallel"            "gridExtra"           "stats"              
## [421] "graphics"            "grDevices"           "utils"              
## [424] "datasets"            "methods"             "base"               
## [427] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [430] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [433] "dplyr"               "readr"               "RColorBrewer"       
## [436] "gplots"              "mixOmics"            "MASS"               
## [439] "reshape"             "lattice"             "ggplot2"            
## [442] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [445] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [448] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [451] "IRanges"             "S4Vectors"           "stats4"             
## [454] "BiocGenerics"        "parallel"            "gridExtra"          
## [457] "stats"               "graphics"            "grDevices"          
## [460] "utils"               "datasets"            "methods"            
## [463] "base"                "pathview"            "org.Mm.eg.db"       
## [466] "DOSE"                "clusterProfiler"     "VennDiagram"        
## [469] "futile.logger"       "dplyr"               "readr"              
## [472] "RColorBrewer"        "gplots"              "mixOmics"           
## [475] "MASS"                "reshape"             "lattice"            
## [478] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [481] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [484] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [487] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [490] "stats4"              "BiocGenerics"        "parallel"           
## [493] "gridExtra"           "stats"               "graphics"           
## [496] "grDevices"           "utils"               "datasets"           
## [499] "methods"             "base"                "tidyr"              
## [502] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [505] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [508] "dplyr"               "readr"               "RColorBrewer"       
## [511] "gplots"              "mixOmics"            "MASS"               
## [514] "reshape"             "lattice"             "ggplot2"            
## [517] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [520] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [523] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [526] "IRanges"             "S4Vectors"           "stats4"             
## [529] "BiocGenerics"        "parallel"            "gridExtra"          
## [532] "stats"               "graphics"            "grDevices"          
## [535] "utils"               "datasets"            "methods"            
## [538] "base"                "qdapRegex"           "tidyr"              
## [541] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [544] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [547] "dplyr"               "readr"               "RColorBrewer"       
## [550] "gplots"              "mixOmics"            "MASS"               
## [553] "reshape"             "lattice"             "ggplot2"            
## [556] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [559] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [562] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [565] "IRanges"             "S4Vectors"           "stats4"             
## [568] "BiocGenerics"        "parallel"            "gridExtra"          
## [571] "stats"               "graphics"            "grDevices"          
## [574] "utils"               "datasets"            "methods"            
## [577] "base"                "gtools"              "qdapRegex"          
## [580] "tidyr"               "pathview"            "org.Mm.eg.db"       
## [583] "DOSE"                "clusterProfiler"     "VennDiagram"        
## [586] "futile.logger"       "dplyr"               "readr"              
## [589] "RColorBrewer"        "gplots"              "mixOmics"           
## [592] "MASS"                "reshape"             "lattice"            
## [595] "ggplot2"             "grid"                "EnsDb.Mmusculus.v79"
## [598] "ensembldb"           "AnnotationFilter"    "GenomicFeatures"    
## [601] "AnnotationDbi"       "Biobase"             "GenomicRanges"      
## [604] "GenomeInfoDb"        "IRanges"             "S4Vectors"          
## [607] "stats4"              "BiocGenerics"        "parallel"           
## [610] "gridExtra"           "stats"               "graphics"           
## [613] "grDevices"           "utils"               "datasets"           
## [616] "methods"             "base"                "ggfortify"          
## [619] "gtools"              "qdapRegex"           "tidyr"              
## [622] "pathview"            "org.Mm.eg.db"        "DOSE"               
## [625] "clusterProfiler"     "VennDiagram"         "futile.logger"      
## [628] "dplyr"               "readr"               "RColorBrewer"       
## [631] "gplots"              "mixOmics"            "MASS"               
## [634] "reshape"             "lattice"             "ggplot2"            
## [637] "grid"                "EnsDb.Mmusculus.v79" "ensembldb"          
## [640] "AnnotationFilter"    "GenomicFeatures"     "AnnotationDbi"      
## [643] "Biobase"             "GenomicRanges"       "GenomeInfoDb"       
## [646] "IRanges"             "S4Vectors"           "stats4"             
## [649] "BiocGenerics"        "parallel"            "gridExtra"          
## [652] "stats"               "graphics"            "grDevices"          
## [655] "utils"               "datasets"            "methods"            
## [658] "base"

Load the proteomics dataset and statistical analysis

The original dataset was loaded.

colon_proteom <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/2015-03_HaigisMouseColon8plex_Prot.csv")
## Warning: Missing column names filled in: 'X2' [2]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Protein Id` = col_character(),
##   X2 = col_character(),
##   `Gene Symbol` = col_character(),
##   Description = col_character(),
##   FABP_1 = col_double(),
##   FABP_2 = col_double(),
##   FABP_4 = col_double(),
##   FABP_5 = col_double(),
##   G12D_1 = col_double(),
##   G12D_2 = col_double(),
##   G12D_3 = col_double(),
##   G12D_4 = col_double()
## )

Calculate the stats for G12D/WT

# establish a new data frame for collecting stats
colon_stats <- colon_proteom[,1:3]
# Calculate the pvalue using parametric unpaired t test
p_value_list <- c()
for (i in 1:dim(colon_proteom)[1]) {
  p_value <- t.test(unlist(colon_proteom[i,9:12]), unlist(colon_proteom[i,5:8]), paired = FALSE)$p.value
  p_value_list <- c(p_value_list, p_value)
}
colon_stats <- cbind(colon_stats, p_value_list)
colnames(colon_stats)[4] <- "p_values" 

# calculate the q value using Benjamini Hochberg FDR correction
q_value_list <- p.adjust(colon_stats$p_values, method = "BH")
colon_stats <- cbind(colon_stats, q_value_list)
colnames(colon_stats)[5] <- "q_values"

# calculate fold change and log fold change
foldchange_list <- c()
for (i in 1:dim(colon_proteom)[1]) {
  foldchange <- foldchange(mean(unlist(colon_proteom[i,9:12])), mean(unlist(colon_proteom[i,5:8])))
  foldchange_list <- c(foldchange_list, foldchange)
}
logfoldchange_list <- foldchange2logratio(foldchange_list)
colon_stats <- cbind(colon_stats, foldchange_list, logfoldchange_list)
colnames(colon_stats)[6:7] <- c("foldChange", "LFC")

Now we output this statistical analysis file into a csv file.

write.csv(colon_stats, "ceMS_diff.csv", col.names = NULL)

Just to check how many siginificant proteins do we have based on p< 0.05 and q< 0.1

sig_dif_stats <- subset(colon_stats, colon_stats$p_values <= 0.05 & colon_stats$q_values <= 0.1)

dim(sig_dif_stats)[1]
## [1] 2348

So there are 2348 proteins identified to have significantly different expression between G12D/WT.

Plot PCA and hierchical clustering

PCA plot

Since the number of significant changes are quite small, I want to use PCA to check how the samples cluster.

dir.create("PDF_figure", showWarnings = FALSE)
df <- colon_proteom[5:12]
df <- as.data.frame(t(df))
df <- cbind(df, c('WT','WT','WT','WT','G12D','G12D','G12D','G12D'))
colnames(df)[8162] <- 'Genotype'
df.set <- as.matrix(df[,1:8161])
df.pca <- prcomp(df.set, center = TRUE, scale = TRUE)
autoplot(df.pca, data = df, colour = 'Genotype') +
  geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
  xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.

pdf('PDF_figure/PCA.pdf',
    height = 4,
    width = 6)
autoplot(df.pca, data = df, colour = 'Genotype') +
  geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
  xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
dev.off()
## quartz_off_screen 
##                 2

Hierchical clustering

pseudoCount = log2(colon_proteom[5:12])

# remove NA, NaN, Inf values from the dataframe
pseudoCount <- na.omit(pseudoCount)
pseudoCount <- pseudoCount[is.finite(rowSums(pseudoCount)),]

mat.dist = pseudoCount
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(10, 10 ))
 suppressMessages(dev.off())
## quartz_off_screen 
##                 2
pdf('PDF_figure/Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(7, 7))
dev.off()
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Plot heatmap, scatterplot, MA plot, and volcano plot

Heatmap

For heatmap, I will z-score all quantifications across all samples for the same protein. Heatmaps for all proteins with p<0.05 and q < 0.1 are plotted

suppressMessages(library(mosaic))
sig_index <- c()
duplicate <- c()
for (i in 1:dim(sig_dif_stats)[1]) {
  index <- grep(sig_dif_stats$`Protein Id`[i], colon_proteom$`Protein Id`)
  if (length(index) == 1) {
     sig_index <- c(sig_index, index)
  }
  else {
    duplicate <- c(duplicate, i)
  }
}
sig_count <- colon_proteom[sig_index,]
sig_dif <- cbind(sig_dif_stats[-duplicate,], sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,12:19] <- zscore(as.numeric(sig_dif[i,12:19]))
}

my_palette <- colorRampPalette(c("red", "white", "blue"))(256)
heatmap_matrix <- as.matrix(sig_dif[,12:19])

png('G12D vs WT colon proteomics.png',
    width = 600,
    height = 1400,
    res = 200,
    pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
          main = "DE proteins\nin colon epithelium\n(G12D/WT) p < 0.05 q < 0.1",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(8,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Proteins",
          cexCol = 1.5,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/Heatmap.pdf',
    width = 6,
    height = 14)
heatmap.2(heatmap_matrix,
          main = "DE proteins\nin colon epithelium\n(G12D/WT) p < 0.05 q < 0.1",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(8,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Proteins",
          cexCol = 1.5,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential G12D/WT

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
colon_stats$KrasG12D_mean <- rowMeans(log2(colon_proteom[,9:12]))
colon_stats$KrasWT_mean <- rowMeans(log2(colon_proteom[,5:8]))
ggplot(colon_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") + 
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT Scatter Plot")

pdf('PDF_figure/Scatter_Plot.pdf',
    width = 5,
    height = 4)
ggplot(colon_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") + 
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT Scatter Plot")
dev.off()
## quartz_off_screen 
##                 2
# MA plot
colon_stats$'baseMean' <- rowMeans(colon_proteom[,5:12])
colon_stats$'log2baseMean' <- log(colon_stats$`baseMean`,2)
red_subset <- subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0)
blue_subset <- subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0)
ggplot(colon_stats, aes(x = colon_stats$`log2baseMean`, y = colon_stats$`LFC`)) +
  xlab("Average Expression") + ylab("LFC") +
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT MA Plot")
## Warning: Use of `colon_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.

pdf('PDF_figure/MA_Plot.pdf',
    width = 5,
    height = 4)
ggplot(colon_stats, aes(x = colon_stats$`log2baseMean`, y = colon_stats$`LFC`)) +
  xlab("Average Expression") + ylab("LFC") +
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "G12D vs WT MA Plot")
## Warning: Use of `colon_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
dev.off()
## quartz_off_screen 
##                 2
# Volcano Plot
ggplot(colon_stats, aes(x = colon_stats$`LFC`, y = -log(colon_stats$`p_values`,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `colon_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.

pdf('PDF_figure/Volcano_Plot.pdf',
    width = 5,
    height = 4)
ggplot(colon_stats, aes(x = colon_stats$`LFC`, y = -log(colon_stats$`p_values`,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `colon_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
dev.off()
## quartz_off_screen 
##                 2

GO analysis

For the Go analysis, I am using all proteins that have a p<0.05 and q < 0.1.

target_gene <- as.character(sig_dif_stats$`Protein Id`)
detected_gene <- as.character(colon_proteom$`Protein Id`)

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
dim(cluster_summary_BP)[1]
## [1] 156
write.csv(cluster_summary_BP, "GO/GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
dim(cluster_summary_MF)[1]
## [1] 22
write.csv(cluster_summary_MF, "GO/GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "UNIPROT",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
dim(cluster_summary_CC)[1]
## [1] 24
write.csv(cluster_summary_CC, "GO/GO analysis_CC.csv")

Draw Dotplot representing the results

Biological Process

png('GO/GO dotplot_BP.png',
    width = 2000,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_BP.pdf',
    width = 20,
    height = 16)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_dotplot #### Molecular Function

png('GO/GO dotplot_MF.png',
    width = 800,
    height = 800,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_MF.pdf',
    width = 8,
    height = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_dotplot #### Cellular Component

png('GO/GO dotplot_CC.png',
    width = 800,
    height = 800,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO dotplot_CC.pdf',
    width = 8,
    height = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_dotplot

Draw enrichment Go plot representing the results

Biological Process

png('GO/GO enrichment_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_BP.pdf',
    width = 16,
    height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot #### Molecular Function

png('GO/GO enrichment_MF.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_MF.pdf',
    width = 10,
    height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_enrichplot #### Cellular Component

png('GO/GO enrichment_CC.png',
    width = 1000,
    height = 1000,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO enrichment_CC.pdf',
    width = 10,
    height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_enrichplot

Draw category netplot representing the results

The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color).

Biological Process

OE_foldchanges <- colon_stats$LFC
names(OE_foldchanges) <- colon_stats$`Gene Symbol`
png('GO/GO cnetplot_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_BP.pdf',
    width = 16,
    height = 16)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_cnetplot #### Molecular Function

png('GO/GO cnetplot_MF.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_MF.pdf',
    width = 16,
    height = 16)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_cnetplot #### Cellular Component

png('GO/GO cnetplot_CC.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2
pdf('PDF_figure/GO cnetplot_CC.pdf',
    width = 16,
    height = 16)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_cnetplot

GSEA

First I need to annotate quantification file with Ensembl ID.

# annotate the tumor stats with gene symbol
annotations_edb <- AnnotationDbi::select(org.Mm.eg.db,
                                           keys = colon_proteom$`Protein Id`,
                                           columns = c("ENSEMBL"),
                                           keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_edb$UNIPROT) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_edb$ENSEMBL) == FALSE)
non_duplicates_idx <- which(is.na(annotations_edb$ENSEMBL) == FALSE)

# Return only the non-duplicated genes using indices
annotations_edb <- annotations_edb[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_edb$ENSEMBL) %>%
  which() %>%
  length()
## [1] 0
# Join the Ensembl annotation to proteomics quantification
colon_proteom <- inner_join(colon_proteom, annotations_edb, by=c("Protein Id"="UNIPROT"))

write.csv(colon_proteom, "GSEA/Raw Quantification.csv")